{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-19T09:40:35.023114Z",
     "start_time": "2020-06-19T09:40:34.897614Z"
    }
   },
   "outputs": [],
   "source": [
    "%reload_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-19T09:43:56.917470Z",
     "start_time": "2020-06-19T09:43:56.734129Z"
    }
   },
   "outputs": [],
   "source": [
    "from utils import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-19T09:41:37.885693Z",
     "start_time": "2020-06-19T09:41:37.583423Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Load the QA corpus; a later cell iterates over `content.question`.\n",
    "content = pd.read_csv('qa_corpus.csv', encoding='utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-19T10:02:56.025381Z",
     "start_time": "2020-06-19T10:02:53.443216Z"
    }
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import random\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "import zhon.hanzi\n",
    "# pip install zhon"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-19T12:25:19.927942Z",
     "start_time": "2020-06-19T12:25:19.681250Z"
    }
   },
   "outputs": [],
   "source": [
    "def split_to_sentence(content):\n",
    "    \"分句\"\n",
    "    pattern = re.compile(\".*?[。；？！]\")  # 非贪婪模式匹配文字内容\n",
    "    sents = re.findall(pattern, content)\n",
    "    return sents\n",
    "\n",
    "\n",
    "def basic_clean(content):\n",
    "    content = content.lower()\n",
    "\n",
    "    # 去除重复字符\n",
    "    content = re.sub('(\\n)+','\\n',content)\n",
    "    content = re.sub('(\\r)+','',content)\n",
    "    content = re.sub('(\\t)+','，',content)\n",
    "\n",
    "    content = re.sub('[？]+?[！]+?','',content)\n",
    "    content = re.sub('[？]+?[。]+?','',content)\n",
    "    content = re.sub('(～)+','～',content)\n",
    "    content = re.sub('(，)+','，',content)\n",
    "    content = re.sub('(。)+','。',content)\n",
    "    content = re.sub('(～。)+','。',content)\n",
    "    content = re.sub('(？)+','？',content)\n",
    "    content = re.sub('(！)+','！',content)\n",
    "    content = re.sub('(；)+','；',content)\n",
    "    content = re.sub('( )+',' ',content).strip()\n",
    "\n",
    "    return content\n",
    "\n",
    "\n",
    "def clean_data_pretrain(row_of_pd, min_sent_len=5):\n",
    "    '''Clean one QA row and format it as a BERT-pretraining document.\n",
    "\n",
    "    The output follows the input format BERT pretraining expects (it is also\n",
    "    easy to turn into single-sentence input for BERT classification):\n",
    "    (1) **One sentence per line** - the question first, then the answer\n",
    "        sentences.\n",
    "    (2) **Blank lines between documents** - the result ends with an empty\n",
    "        line.\n",
    "\n",
    "    Args:\n",
    "        row_of_pd: Mapping-like row with 'question' and 'answer' entries;\n",
    "            both are converted with str() before cleaning.\n",
    "        min_sent_len: Answer fragments shorter than this many characters are\n",
    "            appended to the previous answer line instead of getting a line\n",
    "            of their own.\n",
    "\n",
    "    Returns:\n",
    "        The formatted document as one string.\n",
    "    '''\n",
    "    content_q = basic_clean(str(row_of_pd['question']))\n",
    "    content_a = basic_clean(str(row_of_pd['answer']))\n",
    "\n",
    "    # The answer can be empty after cleaning, so check before indexing [-1].\n",
    "    if content_a and content_a[-1] not in \"。？！.?!\":\n",
    "        content_a += '。'\n",
    "\n",
    "    answer_lines = []\n",
    "    for para in content_a.split('\\n'):\n",
    "        if not para.strip():\n",
    "            # An empty line here would be read as a document boundary.\n",
    "            continue\n",
    "\n",
    "        sents = split_to_sentence(para)\n",
    "        # Recover text after the last terminator if the splitter did not\n",
    "        # return it, so no answer text is lost.\n",
    "        consumed = ''.join(sents)\n",
    "        if para.startswith(consumed):\n",
    "            tail = para[len(consumed):]\n",
    "            if tail.strip():\n",
    "                sents = sents + [tail]\n",
    "\n",
    "        for sent in sents:\n",
    "            # Merge short fragments into the previous ANSWER line only, so a\n",
    "            # short first sentence is never glued onto the question line.\n",
    "            if len(sent) < min_sent_len and answer_lines:\n",
    "                answer_lines[-1] = answer_lines[-1].rstrip() + sent\n",
    "            else:\n",
    "                answer_lines.append(sent)\n",
    "\n",
    "    # The trailing empty line separates this document from the next one.\n",
    "    return '\\n'.join([content_q] + answer_lines) + '\\n\\n'\n",
    "\n",
    "\n",
    "def process_data_for_pretrain(data_path='qa_corpus.csv', target_path='pretrain/', num_files=10, seed=None):\n",
    "    \"\"\"Shuffle the QA corpus and write it out as BERT-pretraining shards.\n",
    "\n",
    "    Every row is formatted with clean_data_pretrain() and the rows are dealt\n",
    "    round-robin into `num_files` text files named bert_<i>_pretrain.txt\n",
    "    inside `target_path`. Every 6000th document written is also printed as a\n",
    "    spot check.\n",
    "\n",
    "    Args:\n",
    "        data_path: CSV file to read (UTF-8).\n",
    "        target_path: Output directory; created when missing.\n",
    "        num_files: Number of shard files to write; must be at least 1.\n",
    "        seed: Optional random_state for the shuffle; None keeps the shuffle\n",
    "            unseeded.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If num_files is smaller than 1.\n",
    "    \"\"\"\n",
    "    if num_files < 1:\n",
    "        raise ValueError(\"num_files must be at least 1\")\n",
    "\n",
    "    if target_path:\n",
    "        os.makedirs(target_path, exist_ok=True)\n",
    "\n",
    "    data = pd.read_csv(data_path, encoding='utf-8')\n",
    "    data = data.sample(frac=1.0, random_state=seed)\n",
    "\n",
    "    target_file_list = []\n",
    "    try:\n",
    "        for i in range(num_files):\n",
    "            shard_path = os.path.join(target_path, \"bert_\" + str(i) + \"_pretrain.txt\")\n",
    "            target_file_list.append(open(shard_path, \"w\", encoding='utf-8'))\n",
    "\n",
    "        # Use the running counter, not the DataFrame label: after shuffling the\n",
    "        # label is arbitrary and need not even be an integer.\n",
    "        for count, (_, row) in enumerate(data.iterrows()):\n",
    "            content = clean_data_pretrain(row)\n",
    "            target_file_list[count % num_files].write(content)\n",
    "\n",
    "            if count % 6000 == 0:\n",
    "                print(content)\n",
    "    finally:\n",
    "        # Close every shard that was opened, even if processing failed.\n",
    "        for target_file in target_file_list:\n",
    "            target_file.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-19T12:25:27.755390Z",
     "start_time": "2020-06-19T12:25:20.800514Z"
    },
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "thankyouverymuch\n",
      "youarewelcome.\n",
      "\n",
      "\n",
      "余额分期的收费标准\n",
      "目前我行余额分期的分期付款手续费费率会由系统根据您的用卡情况进行评估给出，具体分期付款手续费费率及分期期数请以业务办理时系统显示为准。\n",
      "\n",
      "\n",
      "你少说两句吧\n",
      "皇上，臣妾做不到啊。\n",
      "\n",
      "\n",
      "至短反长\n",
      "其实做一个低调的孩子很幸福。\n",
      "\n",
      "\n",
      "补发网银盾\n",
      "您可以到智慧柜员机办理此项业务，方便快捷，节省您的宝贵时间哦。\n",
      "若去柜台办理，请刷身份证取号。\n",
      "\n",
      "\n",
      "人在江湖混\n",
      "怎能不嚣张！呵呵呵。\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "process_data_for_pretrain()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-19T12:13:25.639106Z",
     "start_time": "2020-06-19T12:13:25.433008Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "T/T汇款\n",
      "tcpip协议\n",
      "tnnd\n",
      "thank you\n",
      "Thank\n",
      "Thankyouverymuch\n",
      "Thankyou\n",
      "thx\n",
      "tks\n",
      "TD时你最想要我的哪一招？\n",
      "thanks\n"
     ]
    }
   ],
   "source": [
    "# Print every question whose text starts with a lower- or upper-case \"t\".\n",
    "for question in content.question:\n",
    "    text = str(question)\n",
    "    if text.startswith(('t', 'T')):\n",
    "        print(question)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
